In [1]:
import numpy as np
import pandas as pd
import os
import glob
In [2]:
from datetime import datetime, timedelta
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from __future__ import division
In [3]:
pip install chart_studio
Requirement already satisfied: chart_studio in ./opt/anaconda3/lib/python3.9/site-packages (1.1.0)
Requirement already satisfied: six in ./opt/anaconda3/lib/python3.9/site-packages (from chart_studio) (1.16.0)
Requirement already satisfied: requests in ./opt/anaconda3/lib/python3.9/site-packages (from chart_studio) (2.27.1)
Requirement already satisfied: plotly in ./opt/anaconda3/lib/python3.9/site-packages (from chart_studio) (5.6.0)
Requirement already satisfied: retrying>=1.3.3 in ./opt/anaconda3/lib/python3.9/site-packages (from chart_studio) (1.3.3)
Requirement already satisfied: tenacity>=6.2.0 in ./opt/anaconda3/lib/python3.9/site-packages (from plotly->chart_studio) (8.0.1)
Requirement already satisfied: certifi>=2017.4.17 in ./opt/anaconda3/lib/python3.9/site-packages (from requests->chart_studio) (2021.10.8)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in ./opt/anaconda3/lib/python3.9/site-packages (from requests->chart_studio) (1.26.9)
Requirement already satisfied: idna<4,>=2.5 in ./opt/anaconda3/lib/python3.9/site-packages (from requests->chart_studio) (3.3)
Requirement already satisfied: charset-normalizer~=2.0.0 in ./opt/anaconda3/lib/python3.9/site-packages (from requests->chart_studio) (2.0.4)
Note: you may need to restart the kernel to use updated packages.
In [4]:
import chart_studio.plotly as py
import plotly.graph_objs as go
import plotly.offline as pyoff
In [5]:
#files = os.path.join("/Users/hassaniftikhar4472/Downloads/data/cloud_to_local", "*.csv")
#files = glob.glob(files)
#df = pd.concat(map(pd.read_csv, files), ignore_index=True)
#df.columns =['User_ID', 'Date', 'playtime','mnisimp', 'mnrvimp', 'mnisrev', 'mnrvrev', 'mnadrev', 'mniaprev','revtotal']
#df.drop(columns = ['mnrvimp','mnrvrev','mnadrev','mniaprev','revtotal','playtime'], axis=1, inplace=True)
#df['Date'] = pd.to_datetime(df['Date'])
#df.head()
In [6]:
# saving the dataframe
df = pd.read_csv('/Users/hassaniftikhar4472/Downloads/data/data_3_months.csv')
In [7]:
tx_data = df
In [8]:
tx_data['Date'] = pd.to_datetime(df['Date'])
In [9]:
#creating YearMonth field for the ease of reporting and visualization
tx_data['Date'] = tx_data['Date'].map(lambda date: 100*date.month + date.day)

#calculate Revenue for each row and create a new dataframe with YearMonth - Revenue columns
tx_data['Revenue'] = tx_data['mnisrev']
tx_revenue = tx_data.groupby(['Date'])['Revenue'].sum().reset_index()
tx_revenue
Out[9]:
Date Revenue
0 801 18103.157642
1 802 17830.293629
2 803 17963.880245
3 804 18013.373888
4 805 17879.153577
... ... ...
64 1004 15850.369094
65 1005 15669.182535
66 1006 15369.523638
67 1007 15243.490637
68 1008 15383.735483

69 rows × 2 columns

In [10]:
#X and Y axis inputs for Plotly graph. We use Scatter for line graphs
plot_data = [
    go.Scatter(
        x=tx_revenue['Date'],
        y=tx_revenue['Revenue'],
    )
]

plot_layout = go.Layout(
        xaxis={"type": "category"},
        title='Montly Revenue'
    )
fig = go.Figure(data=plot_data, layout=plot_layout)
In [11]:
pyoff.iplot(fig)
In [12]:
#using pct_change() function to see monthly percentage change
tx_revenue['MonthlyGrowth'] = tx_revenue['Revenue'].pct_change()

#showing first 5 rows
tx_revenue.head()
#visualization - line graph
plot_data = [
    go.Scatter(
        x=tx_revenue.query("Date < 1008")['Date'],
        y=tx_revenue.query("Date < 1008")['MonthlyGrowth'],
    )
]

plot_layout = go.Layout(
        xaxis={"type": "category"},
        title='Montly Growth Rate'
    )

fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
In [13]:
#creating monthly active customers dataframe by counting unique Customer IDs
tx_monthly_active = tx_data.groupby('Date')['User_ID'].nunique().reset_index()

#plotting the output
plot_data = [
    go.Bar(
        x=tx_monthly_active['Date'],
        y=tx_monthly_active['User_ID'],
    )
]

plot_layout = go.Layout(
        xaxis={"type": "category"},
        title='Monthly Active Users'
    )

fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
In [14]:
#create a new dataframe for no. of INT by using quantity field
tx_monthly_sales = tx_data.groupby('Date')['mnisimp'].sum().reset_index()


#plot
plot_data = [
    go.Bar(
        x=tx_monthly_sales['Date'],
        y=tx_monthly_sales['mnisimp'],
    )
]

plot_layout = go.Layout(
        xaxis={"type": "category"},
        title='Monthly Total # INT'
    )

fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
In [15]:
# create a new dataframe for average revenue by taking the mean of it
tx_monthly_order_avg = tx_data.groupby('Date')['Revenue'].mean().reset_index()

#print the dataframe
tx_monthly_order_avg

#plot the bar chart
plot_data = [
    go.Bar(
        x=tx_monthly_order_avg['Date'],
        y=tx_monthly_order_avg['Revenue'],
    )
]

plot_layout = go.Layout(
        xaxis={"type": "category"},
        title='Monthly INT Average'
    )
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
In [16]:
tx_uk = tx_data
#create a dataframe contaning CustomerID and first purchase date
tx_min_purchase = tx_uk.groupby('User_ID').Date.min().reset_index()
tx_min_purchase.columns = ['User_ID','MinINTDate']
#tx_min_purchase['MinPurchaseYearMonth'] = tx_min_purchase['MinPurchaseDate'].map(lambda date: 100*date.year + date.month)

#merge first purchase date column to our main dataframe (tx_uk)
tx_uk = pd.merge(tx_uk, tx_min_purchase, on='User_ID')

print(tx_uk.head())

#create a column called User Type and assign Existing 
#if User's First Purchase Year Month before the selected Invoice Year Month
tx_uk['UserType'] = 'New'
tx_uk.loc[tx_uk['Date']>tx_uk['MinINTDate'],'UserType'] = 'Existing'

#calculate the Revenue per month for each user type
tx_user_type_revenue = tx_uk.groupby(['Date','UserType'])['Revenue'].sum().reset_index()
                                User_ID  Date  playtime  mnisimp   mnisrev  \
0  0fb1ba8c-e46d-4b08-9ace-f0ea12af0e01   912         0        0  0.000000   
1  0fb1ba8c-e46d-4b08-9ace-f0ea12af0e01   909         0        0  0.000000   
2  0fb1ba8c-e46d-4b08-9ace-f0ea12af0e01   910         0        0  0.000000   
3  ed17d241-fa4c-4de3-b3f3-b22d97c0c29b   912         0        0  0.000000   
4  ed17d241-fa4c-4de3-b3f3-b22d97c0c29b   905         0        1  0.002967   

    Revenue  MinINTDate  
0  0.000000         909  
1  0.000000         909  
2  0.000000         909  
3  0.000000         903  
4  0.002967         903  
In [17]:
tx_user_type_revenue
Out[17]:
Date UserType Revenue
0 801 New 18050.743430
1 802 Existing 11616.243356
2 802 New 6161.086208
3 803 Existing 12979.809498
4 803 New 4933.343764
... ... ... ...
116 928 New 2214.006095
117 929 Existing 13980.201803
118 929 New 2202.078237
119 930 Existing 13863.189135
120 930 New 2158.099729

121 rows × 3 columns

In [18]:
#filtering the dates and plot the result
tx_user_type_revenue = tx_user_type_revenue.query("Date != 801 and Date != 1008 and Date != 1007 and Date != 1006 and Date != 1005 and Date != 1004 and Date != 1003 and Date != 1002 and Date != 1001")
plot_data = [
    go.Scatter(
        x=tx_user_type_revenue.query("UserType == 'Existing'")['Date'],
        y=tx_user_type_revenue.query("UserType == 'Existing'")['Revenue'],
        name = 'Existing'
    ),
    go.Scatter(
        x=tx_user_type_revenue.query("UserType == 'New'")['Date'],
        y=tx_user_type_revenue.query("UserType == 'New'")['Revenue'],
        name = 'New'
    )
]

plot_layout = go.Layout(
        xaxis={"type": "category"},
        title='New vs Existing'
    )
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
In [19]:
#create a dataframe that shows new user ratio - we also need to drop NA values 
tx_user_ratio = tx_uk.query("UserType == 'New'").groupby(['Date'])['User_ID'].nunique()/tx_uk.query("UserType == 'Existing'").groupby(['Date'])['User_ID'].nunique() 
tx_user_ratio = tx_user_ratio.reset_index()
tx_user_ratio = tx_user_ratio.dropna()

#print the dafaframe
tx_user_ratio

#plot the result

plot_data = [
    go.Bar(
        x=tx_user_ratio.query("Date>802 and Date<1001")['Date'],
        y=tx_user_ratio.query("Date>802 and Date<1001")['User_ID'],
    )
]

plot_layout = go.Layout(
        xaxis={"type": "category"},
        title='New Customer Ratio'
    )
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
In [20]:
tx_user_ratio
Out[20]:
Date User_ID
1 802 0.779520
2 803 0.499322
3 804 0.402941
4 805 0.363951
5 806 0.345727
... ... ...
64 1004 0.135245
65 1005 0.135044
66 1006 0.129226
67 1007 0.132373
68 1008 0.144850

68 rows × 2 columns

In [21]:
#identify which users are active by looking at their revenue per month
tx_user_purchase = tx_uk.groupby(['User_ID','Date'])['Revenue'].sum().reset_index()

#create retention matrix with crosstab
tx_retention = pd.crosstab(tx_user_purchase['User_ID'], tx_user_purchase['Date']).reset_index()

tx_retention.head()

#create an array of dictionary which keeps Retained & Total User count for each month
months = tx_retention.columns[2:]
retention_array = []
for i in range(len(months)-1):
    retention_data = {}
    selected_month = months[i+1]
    prev_month = months[i]
    retention_data['Date'] = int(selected_month)
    retention_data['TotalUserCount'] = tx_retention[selected_month].sum()
    retention_data['RetainedUserCount'] = tx_retention[(tx_retention[selected_month]>0) & (tx_retention[prev_month]>0)][selected_month].sum()
    retention_array.append(retention_data)
    
#convert the array to dataframe and calculate Retention Rate
tx_retention = pd.DataFrame(retention_array)
tx_retention['RetentionRate'] = tx_retention['RetainedUserCount']/tx_retention['TotalUserCount']

#plot the retention rate graph
plot_data = [
    go.Scatter(
        x=tx_retention.query("Date<1001")['Date'],
        y=tx_retention.query("Date<1001")['RetentionRate'],
        name="organic"
    )
    
]

plot_layout = go.Layout(
        xaxis={"type": "category"},
        title='Monthly Retention Rate'
    )
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
In [22]:
tx_retention
Out[22]:
Date TotalUserCount RetainedUserCount RetentionRate
0 803 814997 460394 0.564903
1 804 819135 460291 0.561923
2 805 823904 457007 0.554685
3 806 818128 444635 0.543479
4 807 843044 446827 0.530016
... ... ... ... ...
62 1004 744263 418285 0.562012
63 1005 742684 418853 0.563972
64 1006 741363 418218 0.564120
65 1007 744697 417250 0.560295
66 1008 743989 404438 0.543607

67 rows × 4 columns